#!/usr/bin/env python3
"""
================================================================================
NWformat6502.py  —  6502 Assembly Language Source File Formatter
================================================================================

Written by Claude Opus 4.6 Extended
Prompts by J R Casey Bralla
https://www.NerdWorld.org


PURPOSE:
    Reads a 6502 assembly source file (*.asm) and reformats it according to a
    consistent column-alignment scheme, normalises hex casing, cleans up
    whitespace, and writes the result back — after first archiving the original.
    Also generates a symbol cross-reference table (.sym) listing every symbol,
    the line where it is defined, and every line where it is referenced.


FORMATTING RULES APPLIED:
    1. Replace every tab character with spaces.
    2. Strip trailing whitespace from every line.
    3. Ensure at least one space between ";" and the first comment character.
    4. Convert hexadecimal digits in $xxxx / #$xx literals to UPPER CASE.
    5. Column alignment (1-indexed):
         Column  1  — Labels and full-line comments (lines starting with ";").
         Column 16  — Instructions and assembler directives (starting with ".").
         Column 32  — Trailing (inline) comments (starting with ";").
    6. Archive the original file by appending
       "Previous_from_YYYY-MM-DD_HH-MM-SS" before the extension.
    7. Save the newly formatted content under the original filename.
    8. Generate a symbol cross-reference table saved as <filename>.sym.

USAGE:
    python NWformat6502.py <filename.asm>

OUTPUTS:
    <filename>.asm                                    — Formatted source (in place)
    <filename>.Previous_from_YYYY-MM-DD_HH-MM-SS.asm  — Backup of original
    <filename>.sym                                    — Symbol cross-reference table

REVISION HISTORY:
    Rev  Date        Author          Description
    ---  ----------  --------------  -------------------------------------------
    1.0  2026-03-17  Claude/Anthr.   Initial release — full formatting pipeline
                                     with tab replacement, trailing-space removal,
                                     comment spacing, hex upper-casing, three-
                                     column alignment, and timestamped backup.
    2.0  2026-03-17  Claude/Anthr.   Added symbol cross-reference table feature.
                                     Two-pass analysis identifies every symbol
                                     definition and all code references.  Output
                                     saved as a companion .sym file alongside the
                                     formatted source.  Added known-mnemonics and
                                     known-directives sets so that opcodes like
                                     "LDA" and directives like ".equ" are never
                                     mistaken for symbol references.
    2.1  2026-03-18  Claude/Anthr.   Single-quoted strings now fully supported
                                     alongside double-quoted strings.  Semicolons
                                     and other special characters inside any
                                     quoted string are no longer misinterpreted
                                     as comment delimiters.  Hex upper-casing and
                                     symbol extraction also respect single quotes.
================================================================================
"""

# =============================================================================
# CHANGE LOG
# =============================================================================
# 2026-03-17  v1.0  — Initial version.
#   • Implements all seven formatting rules.
#   • Handles quoted strings so that text inside ".text" literals is never
#     accidentally modified (hex casing, comment detection, etc.).
#   • Backs up the original file with a local-time timestamp before overwriting.
#   • Extensively commented for readability and future maintenance.
#
# 2026-03-17  v2.0  — Symbol cross-reference table.
#   • NEW: Two-pass symbol analysis (collect definitions, then scan references).
#   • NEW: Generates a .sym file with definition line and all reference lines
#     for every symbol found in the source.
#   • NEW: Console summary of the cross-reference table.
#   • Recognises 6502/65C02 mnemonics and assembler directives so they are
#     excluded from the symbol table (they are opcodes, not user symbols).
#   • Handles case-insensitive matching and whole-word boundaries.
#   • Protects quoted strings and comments from false-positive matches.
#
# 2026-03-18  v2.1  — Single-quote support; robust semicolon handling.
#   • find_comment_start() now tracks both single and double quotes via a
#     while-loop that properly skips backslash-escaped characters.
#   • apply_hex_uppercase() splits on both "..." and '...' so hex digits
#     inside any quoted string are never uppercased.
#   • extract_code_tokens() strips both quote styles before scanning for
#     symbol references, preventing false positives from quoted content.
# =============================================================================

import sys          # Command-line argument handling
import os           # File-path manipulation
import re           # Regular expressions for hex pattern matching
import shutil       # File copy for creating the backup
from datetime import datetime   # Timestamping the backup filename


# ─── version info ────────────────────────────────────────────────────────────
__version__ = "2.1"
__date__    = "2026-03-18"


# ─── column constants (0-indexed internally; the spec uses 1-indexed) ────────
# Each index is simply the 1-indexed column from the formatting rules minus 1.
LABEL_COL       = 1 - 1     # Spec column  1
INSTRUCTION_COL = 16 - 1    # Spec column 16
COMMENT_COL     = 32 - 1    # Spec column 32


# ─── 6502 / 65C02 mnemonics (not user symbols) ──────────────────────────────
# Opcode names that must never be reported as symbols in the cross-reference
# table.  The set is assembled in three steps: the NMOS 6502 mnemonics, the
# 65C02 additions (including WAI and STP), and the Rockwell bit-manipulation
# opcodes RMBn / SMBn / BBRn / BBSn for n = 0..7.
KNOWN_MNEMONICS = set()

# --- Standard 6502 mnemonics ---
KNOWN_MNEMONICS.update((
    "ADC", "AND", "ASL", "BCC", "BCS", "BEQ", "BIT", "BMI", "BNE", "BPL",
    "BRK", "BVC", "BVS", "CLC", "CLD", "CLI", "CLV", "CMP", "CPX", "CPY",
    "DEC", "DEX", "DEY", "EOR", "INC", "INX", "INY", "JMP", "JSR", "LDA",
    "LDX", "LDY", "LSR", "NOP", "ORA", "PHA", "PHP", "PLA", "PLP", "ROL",
    "ROR", "RTI", "RTS", "SBC", "SEC", "SED", "SEI", "STA", "STX", "STY",
    "TAX", "TAY", "TSX", "TXA", "TXS", "TYA",
))

# --- 65C02 additions ---
KNOWN_MNEMONICS.update((
    "BRA", "PHX", "PHY", "PLX", "PLY", "STZ", "TRB", "TSB", "WAI", "STP",
))

# --- Rockwell bit-manipulation (one opcode per bit number 0-7) ---
KNOWN_MNEMONICS.update(
    f"{stem}{bit}"
    for stem in ("RMB", "SMB", "BBR", "BBS")
    for bit in range(8)
)

# ─── assembler directives (not user symbols) ────────────────────────────────
# Assembler pseudo-ops, stored in UPPER CASE, that are never treated as
# symbol names.
KNOWN_DIRECTIVES = {
    ".ASCIIZ",
    ".BYTE",
    ".DS",
    ".EQU",
    ".FILL",
    ".INCLUDE",
    ".ORG",
    ".TEXT",
    ".WORD",
}

# ─── register names (not user symbols) ──────────────────────────────────────
# Single-letter register operands (A, X, Y, S) that must not be confused
# with symbols.
REGISTER_NAMES = set("AXYS")


# =============================================================================
# HELPER FUNCTIONS  —  FORMATTING
# =============================================================================

def find_comment_start(line: str) -> int:
    """
    Locate the semicolon that begins a comment on *line*.

    The line is scanned left to right while remembering which quote
    character (single or double), if any, opened the string literal that is
    currently being read.  A semicolon is reported only while no string is
    open.  Inside a string, a backslash followed by another character is
    treated as an escape pair and both characters are skipped, so an escaped
    quote does not close the string.  A string that is never closed stays
    open to the end of the line, which hides any later semicolon.

    Parameters
    ----------
    line : str
        One line of source code.

    Returns
    -------
    int
        0-based index of the first ';' outside any quoted string,
        or -1 when there is none.
    """
    open_quote = ''             # '' outside a string, else the opening quote
    skip_next = False           # True for the character after an escape '\'
    last_index = len(line) - 1

    for pos, char in enumerate(line):
        if skip_next:
            # This character was escaped by the preceding backslash.
            skip_next = False
            continue

        if open_quote:
            if char == '\\' and pos < last_index:
                skip_next = True        # Escape pair: ignore the next char
            elif char == open_quote:
                open_quote = ''         # Matching quote closes the string
        elif char == ';':
            return pos                  # Comment delimiter outside quotes
        elif char == '"' or char == "'":
            open_quote = char           # A string literal starts here

    return -1


def uppercase_hex_in_text(text: str) -> str:
    """
    Upper-case the digits of every '$'-prefixed hexadecimal literal in *text*.

    Recognised patterns:
        $ff0a   →  $FF0A    ('$' followed by one or more hex digits)
        #$ff    →  #$FF     (the same, preceded by '#')

    Only the run of hex digits directly after the '$' is changed; the '$'
    and any '#' in front of it are copied through untouched.  A '#' that is
    NOT followed by '$' (for example "#0a") does not match and is left
    alone.

    This helper has no notion of quoting: it rewrites every match in the
    text it is given.  Callers that need to protect string literals must
    pass only the unquoted fragments.

    Parameters
    ----------
    text : str
        A fragment of source code.

    Returns
    -------
    str
        The same text with the hex digits of each match upper-cased.
    """
    # Group 1 is the prefix ('$' or '#$'); group 2 is the hex-digit run.
    return re.sub(
        r'(\#?\$)([0-9A-Fa-f]+)',
        lambda hit: hit.group(1) + hit.group(2).upper(),
        text,
    )


def apply_hex_uppercase(line: str) -> str:
    """
    Apply hex upper-casing to a full source line while protecting quoted
    string content from modification.

    Strategy:
        1. Split the line into alternating unquoted / quoted segments.
        2. Apply uppercase_hex_in_text() only to the unquoted segments.
        3. Rejoin all segments.

    A quoted segment is a complete "..." or '...' literal.  Inside a
    literal a backslash escapes the following character, so an escaped
    quote does not end the literal — the same convention
    find_comment_start() uses.  For example, in

        .byte "a\\"$ff"  ; $ff

    the whole "a\\"$ff" literal is preserved and only the "$ff" in the
    comment becomes "$FF".  A quote that is never closed does not start a
    protected segment; the text after it is treated as unquoted.

    Parameters
    ----------
    line : str
        A full line of assembly source.

    Returns
    -------
    str
        The line with hex digits upper-cased outside of string literals.
    """
    double_quoted = r'"(?:\\.|[^"\\])*"'
    single_quoted = r"'(?:\\.|[^'\\])*'"

    # With one capturing group, re.split() returns the text between matches
    # at even indices and the matched string literals at odd indices, so the
    # index parity tells us reliably which segments are quoted.
    segments = re.split(f'({double_quoted}|{single_quoted})', line)

    return ''.join(
        segment if index % 2 else uppercase_hex_in_text(segment)
        for index, segment in enumerate(segments)
    )


def ensure_comment_space(comment: str) -> str:
    """
    Guarantee a space between the leading ';' and the comment text.

    Only the character directly after the first character is inspected:
    when it exists and is not a space, the result is "; " followed by
    everything after the first character.  Otherwise the input is returned
    unchanged.

    Examples:
        ";HELLO"   →  "; HELLO"
        "; HELLO"  →  "; HELLO"   (already spaced)
        ";"        →  ";"         (nothing follows the semicolon)

    Parameters
    ----------
    comment : str
        The comment portion of a line, starting with ';'.

    Returns
    -------
    str
        The comment with guaranteed spacing.
    """
    needs_space = len(comment) > 1 and comment[1] != ' '
    return '; ' + comment[1:] if needs_space else comment


def parse_line(line: str):
    """
    Split one source line into its (label, instruction, comment) fields.

        label       — the first whitespace-delimited token of a line that
                      starts in column 1; '' when the line is indented
        instruction — the remaining code (mnemonic / directive + operand)
        comment     — the trailing comment, beginning at the first ';'
                      found outside a quoted string

    A line whose first non-blank character is ';' is a full-line comment.
    It is returned, with leading whitespace removed, in the *label* slot so
    that the formatter places it in column 1.

    Parameters
    ----------
    line : str
        The source line (tabs already expanded, trailing spaces stripped).

    Returns
    -------
    tuple of (str, str, str)
        (label, instruction, comment); any field may be ''.
    """
    body = line.lstrip()

    # Nothing but whitespace: every field is empty.
    if not body:
        return ('', '', '')

    # Full-line comment: hand it back in the label position.
    if body[0] == ';':
        return (body, '', '')

    # Separate the code from a trailing comment, if there is one.
    cut = find_comment_start(line)
    if cut < 0:
        code, remark = line.rstrip(), ''
    else:
        code, remark = line[:cut].rstrip(), line[cut:].strip()

    # Indented (or empty) code carries no label; it is all instruction.
    if not code or code[0].isspace():
        return ('', code.strip(), remark)

    # Code starting in column 1: the first token is the label.  Splitting on
    # whitespace keeps labels that contain '-' (e.g. "CPIY-1") in one piece.
    pieces = code.split(maxsplit=1)
    operation = pieces[1].strip() if len(pieces) == 2 else ''
    return (pieces[0], operation, remark)


def format_line(label: str, instruction: str, comment: str) -> str:
    """
    Reassemble the three fields into one properly column-aligned line.

    Column layout (1-indexed per the spec, 0-indexed here):
        Col  1 (idx  0): Label or full-line comment
        Col 16 (idx 15): Instruction / directive
        Col 32 (idx 31): Trailing comment

    If the label is long enough to collide with the instruction column, we
    insert exactly one space between them rather than overlap.  Likewise for
    the instruction overflowing into the comment column.

    Parameters
    ----------
    label : str
        The label text (or a full-line comment).  May be ''.
    instruction : str
        The mnemonic/directive + operand.  May be ''.
    comment : str
        The trailing comment (including the leading ';').  May be ''.

    Returns
    -------
    str
        A single formatted line (no trailing whitespace, no newline).
    """
    # ── blank line (all fields empty) ────────────────────────────────────
    if not label and not instruction and not comment:
        return ''

    # ── full-line comment (label field holds the comment, others empty) ──
    if label.startswith(';') and not instruction and not comment:
        return label

    # ── label only (no instruction, no comment) ─────────────────────────
    if label and not instruction and not comment:
        return label

    # ── build the line left to right ────────────────────────────────────
    line = ''

    # Start with the label at column 1 (index 0).
    if label:
        line = label

    # Pad (or add one space) up to the instruction column.
    if instruction:
        if len(line) < INSTRUCTION_COL:
            # Pad with spaces to reach column 16.
            line = line.ljust(INSTRUCTION_COL)
        else:
            # Label is too long — just add one space separator.
            line += ' '
        line += instruction

    # Pad (or add one space) up to the comment column.
    if comment:
        if not line:
            # Edge case: comment-only on an "instruction" line (shouldn't
            # normally happen after parsing, but handle it defensively).
            line = ''.ljust(INSTRUCTION_COL) + comment
        elif len(line) < COMMENT_COL:
            # Pad with spaces to reach column 32.
            line = line.ljust(COMMENT_COL)
        else:
            # Instruction is too long — just add one space separator.
            line += ' '
        line += comment

    return line


def create_backup(filepath: str) -> str:
    """
    Create a timestamped backup copy of the original file.

    The backup name is formed by inserting
        "Previous_from_YYYY-MM-DD_HH-MM-SS"
    between the stem and the extension of the original filename, using the
    local time.  The backup is written to the same directory as the
    original.

    Example:
        AIM-65.asm  →  AIM-65.Previous_from_2026-03-17_14-30-05.asm

    If a file with that name already exists (for example when the tool is
    run twice within the same second), a numeric suffix is added —
    "..._14-30-05_2.asm", "..._14-30-05_3.asm", ... — so that an earlier
    backup is never overwritten.

    Parameters
    ----------
    filepath : str
        Path to the original file.

    Returns
    -------
    str
        Path to the newly created backup file.
    """
    # Decompose the path into directory, stem, and extension.
    directory = os.path.dirname(filepath) or '.'
    stem, ext = os.path.splitext(os.path.basename(filepath))

    # Timestamp in LOCAL time (not UTC).
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # Pick the first name that is not already taken.
    backup_path = os.path.join(
        directory, f"{stem}.Previous_from_{timestamp}{ext}")
    attempt = 1
    while os.path.exists(backup_path):
        attempt += 1
        backup_path = os.path.join(
            directory, f"{stem}.Previous_from_{timestamp}_{attempt}{ext}")

    # copy2 copies the contents and also tries to preserve file metadata.
    shutil.copy2(filepath, backup_path)

    return backup_path


# =============================================================================
# HELPER FUNCTIONS  —  SYMBOL TABLE / CROSS-REFERENCE
# =============================================================================

# A complete "..." or '...' literal in which a backslash escapes the next
# character — the same quoting convention find_comment_start() applies.
_QUOTED_STRING_RE = re.compile(
    r'"(?:\\.|[^"\\])*"'
    r"|'(?:\\.|[^'\\])*'"
)

# An identifier: starts with a letter, '_', '.' or '@' and continues with
# letters, digits, '_', '.', '@' or '-' (labels such as "CPIY-1").  The
# look-behind rejects a match that begins directly after '$' or in the middle
# of an alphanumeric run, so the digits of "$FF" or "$1F" never come out as
# tokens.
_IDENTIFIER_RE = re.compile(
    r'(?<![A-Za-z0-9_.@$])[A-Za-z_.@][A-Za-z0-9_.@-]*')


def extract_code_tokens(line: str) -> list:
    """
    Extract all identifier-like tokens from the CODE portion of a source line,
    excluding the label (column 1), comments, quoted strings, and the digits
    of '$'-prefixed hexadecimal literals.

    This is used to find symbol references in instructions and operands.
    For example, from the line:
        Start          JSR CRLOW       ; Clear the display
    the result is ["JSR", "CRLOW"], and from
        LDIY           .equ CPIY
    the result is [".equ", "CPIY"].  Mnemonics and directives are NOT
    removed here; callers filter them with is_symbol_name().

    Parameters
    ----------
    line : str
        A raw source line (tabs already expanded, trailing spaces stripped).

    Returns
    -------
    list of str
        Identifier tokens found in the instruction / operand area, in
        source order and in their original case.
    """
    # ── Step 1: Remove the trailing comment (';' outside quotes) ─────────
    cut = find_comment_start(line)
    code_part = line if cut < 0 else line[:cut]

    # ── Step 2: Remove the label (first token of a column-1 line) ────────
    if code_part and not code_part[0].isspace():
        pieces = code_part.split(None, 1)
        code_part = pieces[1] if len(pieces) > 1 else ''

    # ── Step 3: Blank out quoted strings ─────────────────────────────────
    # Each literal is replaced by a space (not removed outright) so the text
    # on either side of it cannot fuse into a single bogus token.
    code_part = _QUOTED_STRING_RE.sub(' ', code_part)

    # ── Step 4: Collect the identifiers ──────────────────────────────────
    return _IDENTIFIER_RE.findall(code_part)


def is_symbol_name(token: str, *, allow_hex_like: bool = False) -> bool:
    """
    Determine whether a token looks like a user-defined symbol name
    (as opposed to a mnemonic, directive, register, or numeric literal).

    A token is considered a symbol name if:
      - It is NOT a known 6502/65C02 mnemonic (e.g., LDA, JSR).
      - It is NOT a known assembler directive (e.g., .equ, .byte).
      - It is NOT a register name (A, X, Y, S).
      - It does NOT look like a bare hex value (all hex digits, ≤4 chars),
        unless *allow_hex_like* is true.

    All comparisons against the known-name sets are case-insensitive.

    Parameters
    ----------
    token : str
        A token extracted from the instruction / operand area.
    allow_hex_like : bool, keyword-only, default False
        When true, the bare-hex check is skipped, so names such as "FAC",
        "ADD" or "FEED" are accepted.  Use this when the tokens come from
        extract_code_tokens(), which already leaves out the digits of
        '$'-prefixed literals.

    Returns
    -------
    bool
        True if the token appears to be a user-defined symbol name.
    """
    upper = token.upper()

    # Opcodes, pseudo-ops and register operands are never user symbols.
    if (upper in KNOWN_MNEMONICS
            or upper in KNOWN_DIRECTIVES
            or upper in REGISTER_NAMES):
        return False

    # Short runs of pure hex digits are most likely bare hex values.
    if (not allow_hex_like
            and len(token) <= 4
            and re.fullmatch(r'[0-9A-Fa-f]+', token)):
        return False

    return True


def build_symbol_table(lines: list) -> dict:
    """
    Build a symbol cross-reference table from the source lines.

    This is a TWO-PASS process over the source.  Blank lines and full-line
    comments are ignored by both passes.

    Pass A — Collect Definitions:
        The first whitespace-delimited token of every line that starts in
        column 1 (after removing any trailing comment and one optional
        trailing ':') defines a symbol.  The first definition wins; a later
        line defining the same name is listed among that symbol's
        references so the duplicate shows up in the report.

    Pass B — Collect References:
        Every token from the instruction / operand area that passes
        is_symbol_name() and matches a defined symbol (case-insensitive)
        records its line number as a reference — unless that line is the
        symbol's own definition line.  Hex-looking names are accepted here
        (allow_hex_like=True) because the tokenizer never returns the digits
        of '$'-prefixed literals; without this, labels such as "FAC" or
        "ADD" would never show any references.

    Parameters
    ----------
    lines : list of str
        The raw source lines (before formatting), one string per line.

    Returns
    -------
    dict
        A dictionary keyed by symbol name (UPPER CASE) with values that are
        dicts containing:
            'original'   : str  — the symbol name as it first appeared
            'defined'    : int  — the line number where it was defined (1-based)
            'references' : list of int — sorted line numbers where the
                           symbol is referenced (excluding the def line)
    """
    # ─── Normalise once; both passes work on the same (number, text) pairs ─
    code_lines = []
    for line_num, raw_line in enumerate(lines, start=1):
        # Expand tabs and strip trailing whitespace (same as the formatter).
        line = raw_line.expandtabs(8).rstrip()
        if not line or line.lstrip().startswith(';'):
            continue                    # Blank line or full-line comment
        code_lines.append((line_num, line))

    sym_table = {}

    # =====================================================================
    # PASS A  —  Collect all symbol definitions
    # =====================================================================
    for line_num, line in code_lines:
        if line[0].isspace():
            continue                    # Indented line: no label here

        # Drop a trailing comment first so "LABEL;note" still yields "LABEL".
        cut = find_comment_start(line)
        code = line if cut < 0 else line[:cut]

        label = code.split(None, 1)[0]
        if label.endswith(':'):
            label = label[:-1]          # Optional trailing colon ("Loop:")
        if not label:
            continue                    # A lone ':' names nothing

        key = label.upper()
        entry = sym_table.get(key)
        if entry is None:
            sym_table[key] = {
                'original':   label,        # Preserve original case
                'defined':    line_num,     # Line where defined
                'references': set(),        # Becomes a sorted list at the end
            }
        else:
            # Duplicate definition — list it among the references.
            entry['references'].add(line_num)

    # =====================================================================
    # PASS B  —  Collect all symbol references
    # =====================================================================
    for line_num, line in code_lines:
        for token in extract_code_tokens(line):
            if not is_symbol_name(token, allow_hex_like=True):
                continue                # Mnemonic, directive or register

            entry = sym_table.get(token.upper())
            if entry is not None and line_num != entry['defined']:
                entry['references'].add(line_num)

    # Hand the references back as sorted lists, as documented.
    for entry in sym_table.values():
        entry['references'] = sorted(entry['references'])

    return sym_table


def write_symbol_file(sym_table: dict, sym_path: str, source_name: str) -> None:
    """
    Write the symbol cross-reference table to a .sym file.

    The output is a columned text report:
        - Header with the source filename and generation timestamp.
        - Column headings and an underline row aligned with them.
        - One row per symbol, sorted by its upper-case key, showing:
            • Symbol name (original case)
            • Line where defined
            • All lines where referenced (comma-separated), or
              "(no references)"
        - Footer with symbol count summary.

    Parameters
    ----------
    sym_table : dict
        The cross-reference table as returned by build_symbol_table().
    sym_path : str
        Output file path for the .sym file (written as UTF-8, replacing any
        existing file).
    source_name : str
        The source filename (for the report header).
    """
    # ── Column widths ────────────────────────────────────────────────────
    # The name column fits the widest symbol (never narrower than the
    # "Symbol" heading) plus two spaces of padding.
    widest_name = max(
        [len(entry['original']) for entry in sym_table.values()] + [6])
    name_width = widest_name + 2
    def_width  = 10                     # "Defined" column, right-aligned

    separator = "=" * 78

    heading = (f"  {'Symbol':<{name_width}}"
               f"{'Defined':>{def_width}}"
               f"    Referenced on Line(s)")

    # The underline is laid out with the same widths and gaps as the heading
    # and data rows, so each rule sits directly beneath its column heading.
    underline = ("  " + "-" * (name_width - 1) + " "
                 + ("-" * len("Defined")).rjust(def_width)
                 + "    " + "-" * 42)

    # ── Symbol rows (sorted alphabetically by key) ───────────────────────
    rows = []
    for key in sorted(sym_table):
        entry = sym_table[key]
        refs = entry['references']
        ref_str = ', '.join(str(r) for r in refs) if refs else "(no references)"
        rows.append(f"  {entry['original']:<{name_width}}"
                    f"{entry['defined']:>{def_width}}"
                    f"    {ref_str}\n")
    if not rows:
        rows.append("  (no symbols defined)\n")

    # ── Summary counts ───────────────────────────────────────────────────
    total_symbols = len(sym_table)
    referenced    = sum(1 for e in sym_table.values() if e['references'])
    unreferenced  = total_symbols - referenced

    # ── Write the file ───────────────────────────────────────────────────
    with open(sym_path, 'w', encoding='utf-8') as f:
        # Header
        f.write(f"{separator}\n")
        f.write(f"  SYMBOL CROSS-REFERENCE TABLE\n")
        f.write(f"  Source: {source_name}\n")
        f.write(f"  Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"  Formatter: format_6502.py v{__version__}\n")
        f.write(f"{separator}\n\n")

        # Column headings
        f.write(f"{heading}\n")
        f.write(f"{underline}\n")

        # Body
        f.writelines(rows)

        # Footer / Summary
        f.write(f"\n{separator}\n")
        f.write(f"  Total symbols defined : {total_symbols}\n")
        f.write(f"  Symbols referenced    : {referenced}\n")
        f.write(f"  Symbols unreferenced  : {unreferenced}\n")
        f.write(f"{separator}\n")


def print_symbol_summary(sym_table: dict) -> None:
    """
    Print a condensed symbol cross-reference summary to the console.

    For an empty table a single "No symbols found." line is printed.
    Otherwise the output is a heading, an underline, one row per symbol
    (sorted by its upper-case key), a blank line, and a one-line count of
    referenced / unreferenced symbols.  At most six reference line numbers
    are shown per symbol; longer lists show the first five followed by
    "... (+N more)".

    The "Defined" column is as wide as its heading in the heading,
    underline and data rows alike, so all columns line up.  (Previously the
    heading was four columns wider than the rows beneath it.)

    Parameters
    ----------
    sym_table : dict
        The cross-reference table as returned by build_symbol_table().
    """
    total = len(sym_table)
    if total == 0:
        print("  No symbols found.")
        return

    # Column widths: the name column fits the widest symbol (at least the
    # width of "Symbol") plus two spaces of padding.
    max_name   = max(6, max(len(e['original']) for e in sym_table.values()))
    name_width = max_name + 2
    def_width  = len('Defined')
    refs_width = 5

    print(f"  {'Symbol':<{name_width}}{'Defined':>{def_width}}"
          f"  {'Refs':>{refs_width}}  Reference Lines")
    print(f"  {'-' * max_name}  {'-' * def_width}"
          f"  {'-' * refs_width}  {'-' * 40}")

    for key in sorted(sym_table):
        entry = sym_table[key]
        refs = entry['references']

        if not refs:
            ref_str = "(none)"
        elif len(refs) <= 6:
            ref_str = ', '.join(str(r) for r in refs)
        else:
            # Keep console rows short: first five, then a count of the rest.
            shown = ', '.join(str(r) for r in refs[:5])
            ref_str = f"{shown}, ... (+{len(refs) - 5} more)"

        print(f"  {entry['original']:<{name_width}}"
              f"{entry['defined']:>{def_width}}"
              f"  {len(refs):>{refs_width}}  {ref_str}")

    # Counts.
    referenced   = sum(1 for e in sym_table.values() if e['references'])
    unreferenced = total - referenced
    print()
    print(f"  {total} symbol(s): {referenced} referenced, "
          f"{unreferenced} unreferenced.")


# =============================================================================
# MAIN PROCESSING PIPELINE
# =============================================================================

def format_file(filepath: str) -> None:
    """
    Run the complete formatting pipeline on one assembly source file.

    Pipeline:
        1. Load every line of the source file.
        2. Transform each line independently:
             - expand tabs (8-column stops) and drop trailing whitespace,
             - upper-case hex literals via apply_hex_uppercase(),
             - split into (label, instruction, comment) via parse_line(),
             - normalise the spacing after ';' in comments,
             - rebuild the line via format_line().
        3. Build the symbol cross-reference table from the original lines
           and print its console summary.
        4. Archive the original through create_backup().
        5. Overwrite the source file with the formatted lines.
        6. Write the cross-reference table next to the source as <stem>.sym.

    Parameters
    ----------
    filepath : str
        Path to the *.asm file to format.
    """
    # ── Step 1: Load the source ──────────────────────────────────────────
    with open(filepath, 'r', encoding='utf-8') as src:
        original_lines = src.readlines()

    print(f"Read {len(original_lines)} lines from '{filepath}'.")

    # ── Step 2: Transform line by line ───────────────────────────────────
    formatted_lines = []
    for raw_line in original_lines:
        # Tabs -> spaces (8-column stops), then trim the right-hand side;
        # rstrip() also removes the newline that readlines() kept.
        cleaned = raw_line.expandtabs(8).rstrip()

        # Upper-case hex digits, then split into the three logical fields.
        label, instruction, comment = parse_line(apply_hex_uppercase(cleaned))

        # Guarantee a space after ';' — both for trailing comments and for
        # full-line comments, which parse_line() hands back in the label slot.
        if comment:
            comment = ensure_comment_space(comment)
        if label.startswith(';'):
            label = ensure_comment_space(label)

        # Re-emit the fields at their fixed columns.
        formatted_lines.append(format_line(label, instruction, comment))

    # ── Step 3: Symbol cross-reference ───────────────────────────────────
    # The analysis reads the ORIGINAL lines.  Exactly one formatted line is
    # produced per input line, so the reported line numbers apply equally
    # to the formatted file.
    print()
    print("Building symbol cross-reference table...")
    sym_table = build_symbol_table(original_lines)
    print_symbol_summary(sym_table)

    # ── Step 4: Archive the original before anything is overwritten ──────
    print()
    backup_path = create_backup(filepath)
    print(f"Backup created: '{backup_path}'")

    # ── Step 5: Replace the source with the formatted text ───────────────
    with open(filepath, 'w', encoding='utf-8') as dst:
        dst.writelines(text + '\n' for text in formatted_lines)

    print(f"Formatted file saved as '{filepath}'.")
    print(f"Total lines processed: {len(formatted_lines)}")

    # ── Step 6: Emit the .sym file alongside the source ──────────────────
    sym_path = os.path.splitext(filepath)[0] + '.sym'
    write_symbol_file(sym_table, sym_path, os.path.basename(filepath))
    print(f"Symbol table saved as '{sym_path}'.")


# =============================================================================
# ENTRY POINT
# =============================================================================

def main():
    """
    Parse command-line arguments and invoke the formatter.

    Usage:
        python <this script> <filename.asm>

    The script name shown in the usage text is taken from sys.argv[0] so it
    stays correct if the file is renamed.  Usage and error diagnostics go to
    stderr; the process exits with status 1 when the argument count is
    wrong, the path is not an existing file, or the file cannot be read,
    decoded, or written.
    """
    # ── print banner ─────────────────────────────────────────────────────
    print(f"═══ 6502 Assembly Formatter  v{__version__}  ({__date__}) ═══")
    print()

    # ── validate arguments ───────────────────────────────────────────────
    if len(sys.argv) != 2:
        # sys.argv[0] may be missing or empty (e.g. embedded interpreters),
        # so fall back to the name given in the file header.
        script = (os.path.basename(sys.argv[0])
                  if sys.argv and sys.argv[0] else 'NWformat6502.py')
        print(f"Usage: python {script} <filename.asm>", file=sys.stderr)
        print("       Formats a 6502 assembly source file in place.",
              file=sys.stderr)
        sys.exit(1)

    filepath = sys.argv[1]

    # Check that the path names an existing regular file.  Readability is
    # not tested here; read/decode failures are handled below.
    if not os.path.isfile(filepath):
        print(f"Error: File '{filepath}' not found.", file=sys.stderr)
        sys.exit(1)

    # ── run the formatter ────────────────────────────────────────────────
    try:
        format_file(filepath)
    except (OSError, UnicodeDecodeError) as exc:
        # Permission problems, a full disk, or a source that is not valid
        # UTF-8 should produce a readable message rather than a traceback.
        print(f"Error: Could not process '{filepath}': {exc}",
              file=sys.stderr)
        sys.exit(1)

    print()
    print("Done.  Review the formatted file, symbol table, and backup.")


# Run the command-line entry point only when this file is executed as a
# script; importing it as a module does not call main().
if __name__ == '__main__':
    main()
